# -*- coding: utf-8 -*-
import scrapy
import re, json
from ..items import JinritoutiaoItem


class JrttSpider(scrapy.Spider):
    """Spider that reads type entries from a Toutiao JS bundle, then scrapes focus news.

    ``parse`` extracts ``name``/``url`` pairs from the JavaScript file listed
    in ``start_urls`` and schedules one request to the focus-news API.
    ``parse_news`` turns every entry of that API response into its own item.
    """

    name = 'jrtt'
    start_urls = ['https://s3a.pstatp.com/toutiao/static/js/page/index_node/index.43431e8f7c9b9140d114.js']

    def parse(self, response):
        """Extract the type list from the JS bundle and request the focus-news feed.

        Yields:
            A single ``scrapy.Request`` for the focus-news API whose ``meta``
            carries a ``JinritoutiaoItem`` with ``type_name`` and ``type_url``
            filled in. Nothing is yielded (and an error is logged) when the
            bundle does not contain the expected structure.
        """
        # re.search instead of findall(...)[0]: a changed bundle layout must
        # not crash the callback with an IndexError.
        section = re.search(r'items:(.*?)}}},components', response.text, re.S)
        if section is None:
            self.logger.error('No "items" section found in %s', response.url)
            return

        types = re.findall(r'name:"(.*?)",url:"(.*?)"', section.group(1), re.S)
        if not types:
            # Previously this path hit a NameError on the unbound ``item``.
            self.logger.error('No name/url pairs found in %s', response.url)
            return

        # NOTE(review): only one request is issued and it carries the LAST
        # name/url pair, exactly as before. The request URL does not depend on
        # the pair, and repeated requests for the same URL would be dropped by
        # Scrapy's duplicate filter by default - confirm which type the news
        # items are really meant to be tagged with.
        name, url = types[-1]
        if url.startswith('//'):
            # Protocol-relative URL.
            url = 'https:' + url
        elif url.startswith('/'):
            # Site-relative path.
            url = 'https://www.toutiao.com' + url

        item = JinritoutiaoItem()
        item['type_name'] = name
        item['type_url'] = url

        news_url = 'https://www.toutiao.com/api/pc/focus/'
        yield scrapy.Request(url=news_url, callback=self.parse_news, meta={'item': item})

    def parse_news(self, response):
        """Build one item per entry of the focus-news API response.

        Reads the list under ``data`` -> ``pc_feed_focus`` in the JSON body
        and, for each entry, yields a copy of the item passed in
        ``response.meta['item']`` extended with ``news_title``, ``news_href``
        and ``news_img``.

        Raises:
            KeyError: if the JSON body or an entry lacks an expected key.
            ValueError: if the response body is not valid JSON.
        """
        base_item = response.meta['item']
        news = json.loads(response.text)['data']['pc_feed_focus']
        for entry in news:
            # Copy per entry: yielding the same object repeatedly makes every
            # emitted item alias one mutable instance, so later entries would
            # overwrite earlier ones wherever an item is still being held.
            item = base_item.copy()
            item['news_title'] = entry['title']
            item['news_href'] = 'https://www.toutiao.com' + entry['display_url']
            item['news_img'] = 'https:' + entry['image_url']
            self.logger.debug('%s %s', item['news_title'], item['news_href'])
            yield item
